from pathlib import Path
from pyarrow import parquet
from huggingface_hub import login
import json

def prepare():
    """Unpack the parquet shards into audio files plus ``metadata.jsonl``.

    Every ``*.parquet`` file found in the training directory is loaded into a
    DataFrame. For each row, the raw bytes of ``src_audio`` and ``trans_audio``
    are written into the output directory under the ``path`` stored alongside
    them, and one JSON object per row (file names and transcripts) is appended
    to ``metadata.jsonl`` in the same directory.

    Each row is expected to expose ``src_audio`` / ``trans_audio`` mappings
    with ``path`` and ``bytes`` keys, and ``src_text`` / ``trans_text`` values.
    """
    # Prepare the data for upload.
    data_dir = Path('/root/autodl-tmp/malay/train')
    out_dir = Path('/root/autodl-tmp/whisper-sft')
    out_dir.mkdir(parents=True, exist_ok=True)
    meta_file = out_dir / 'metadata.jsonl'

    # The context manager guarantees the metadata file is flushed and closed,
    # even if a shard fails to load halfway through.
    with open(meta_file, 'w', encoding='utf-8', newline='') as fw_meta:
        for parquet_file in data_dir.glob("*.parquet"):
            pq = parquet.ParquetFile(parquet_file)
            df = pq.read().to_pandas()

            print(parquet_file)
            for _, row in df.iterrows():
                src_audio_file = out_dir / row['src_audio']['path']
                src_audio_file.write_bytes(row['src_audio']['bytes'])

                trans_audio_file = out_dir / row['trans_audio']['path']
                trans_audio_file.write_bytes(row['trans_audio']['bytes'])

                json_data = {
                    'src_file_name': src_audio_file.name,
                    'src_text': row['src_text'],
                    # Must reference the translated clip, not the source one.
                    'trans_file_name': trans_audio_file.name,
                    'trans_text': row['trans_text'],
                }
                # JSON Lines requires exactly one object per line; without the
                # terminator the records run together and cannot be parsed.
                fw_meta.write(json.dumps(json_data, ensure_ascii=False) + '\n')


def upload():
    """Load the prepared audio folder and push it to the Hugging Face Hub.

    The access token is read from the ``HF_TOKEN`` environment variable. When
    the variable is unset, ``None`` is passed and ``push_to_hub`` falls back to
    the credentials cached by a previous Hugging Face login.
    """
    # Upload to Hugging Face.
    import os

    from datasets import load_dataset

    # SECURITY: credentials must never live in source control. Any token that
    # was previously committed in this file should be revoked and replaced.
    token = os.environ.get("HF_TOKEN")

    audio_dataset = load_dataset("audiofolder", data_dir="/root/autodl-tmp/whisper-sft")
    audio_dataset.push_to_hub("creasson/malay_to_chinese", token=token)


if __name__ == "__main__":
    # prepare() is left disabled here: it only needs to run when the audio
    # files and metadata.jsonl have to be (re)generated before an upload.
    upload()

        